Libraries used
library(tidyverse)
library(knitr)
# Read the training set from its local CSV export.
# NOTE(review): the path is absolute and machine-specific; the report only
# runs where this exact file location exists.
df <- read_csv(
  file = "C:/Users/admin/Downloads/Drive Data/train_data.csv"
)
Train data dimensions:
# Rows x columns of the data, restricted to columns 2 through 18
# (the first column is left out of the report).
df %>%
  select(2:18) %>%
  dim()
## [1] 1000000 17
Train data variable summary
# Per-column summary() of columns 2 through 18, rendered with kable().
df %>%
  select(2:18) %>%
  summary() %>%
  kable()
| id | y | amount_current_loan | term | credit_score | loan_purpose | yearly_income | home_ownership | bankruptcies | years_current_job | monthly_debt | years_credit_history | months_since_last_delinquent | open_accounts | credit_problems | credit_balance | max_open_credit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. : 1 | Min. :0.0 | Min. : 10802 | Length:1000000 | Length:1000000 | Length:1000000 | Min. : 76627 | Length:1000000 | Min. :0.0000 | Min. : 0.00 | Min. : 0 | Min. : 4.0 | Min. : 0.0 | Min. : 0.00 | Min. : 0.0000 | Min. : 0 | Min. :0.000e+00 | |
| 1st Qu.: 250001 | 1st Qu.:0.0 | 1st Qu.:174394 | Class :character | Class :character | Class :character | 1st Qu.: 825797 | Class :character | 1st Qu.:0.0000 | 1st Qu.: 3.00 | 1st Qu.: 10324 | 1st Qu.:13.0 | 1st Qu.: 16.0 | 1st Qu.: 8.00 | 1st Qu.: 0.0000 | 1st Qu.: 113392 | 1st Qu.:2.700e+05 | |
| Median : 500001 | Median :0.5 | Median :269676 | Mode :character | Mode :character | Mode :character | Median : 1148550 | Mode :character | Median :0.0000 | Median : 6.00 | Median : 16319 | Median :17.0 | Median : 32.0 | Median :10.00 | Median : 0.0000 | Median : 210539 | Median :4.600e+05 | |
| Mean : 500001 | Mean :0.5 | Mean :316659 | NA | NA | NA | Mean : 1344805 | NA | Mean :0.1192 | Mean : 5.88 | Mean : 18550 | Mean :18.1 | Mean : 34.9 | Mean :11.18 | Mean : 0.1762 | Mean : 293847 | Mean :7.367e+05 | |
| 3rd Qu.: 750000 | 3rd Qu.:1.0 | 3rd Qu.:435160 | NA | NA | NA | 3rd Qu.: 1605899 | NA | 3rd Qu.:0.0000 | 3rd Qu.:10.00 | 3rd Qu.: 24059 | 3rd Qu.:22.0 | 3rd Qu.: 51.0 | 3rd Qu.:14.00 | 3rd Qu.: 0.0000 | 3rd Qu.: 367422 | 3rd Qu.:7.674e+05 | |
| Max. :1000000 | Max. :1.0 | Max. :789250 | NA | NA | NA | Max. :165557393 | NA | Max. :7.0000 | Max. :10.00 | Max. :435843 | Max. :70.0 | Max. :176.0 | Max. :76.00 | Max. :15.0000 | Max. :32878968 | Max. :1.540e+09 | |
| NA | NA | NA | NA | NA | NA | NA’s :219439 | NA | NA’s :1805 | NA’s :45949 | NA | NA | NA’s :529539 | NA | NA | NA | NA’s :27 |
# Treat the loan purpose and the y column as categorical (factor) variables
# for the grouped tables and plots that follow.
df <- df %>%
  mutate(
    loan_purpose = as.factor(loan_purpose),
    y = as.factor(y)
  )
Summary of the categorical variable loan_purpose (number of rows per category)
# Number of rows per loan purpose, most frequent first, rendered with kable().
df %>%
  count(loan_purpose, sort = TRUE) %>%
  kable()
| loan_purpose | n |
|---|---|
| debt_consolidation | 785428 |
| other | 91481 |
| home_improvements | 57517 |
| business_loan | 17756 |
| buy_a_car | 11855 |
| medical_bills | 11521 |
| buy_house | 6897 |
| take_a_trip | 5632 |
| major_purchase | 3727 |
| small_business | 3242 |
| moving | 1548 |
| vacation | 1166 |
| wedding | 1129 |
| educational_expenses | 992 |
| renewable_energy | 109 |
# Dodged bar chart of row counts for every (y, loan_purpose) combination,
# with bars filled by y.
# count() returns an ungrouped tibble, so no leftover grouping by y is passed
# on to the plot and summarise() does not emit its ".groups" message.
df %>%
  count(y, loan_purpose) %>%
  ggplot(aes(x = loan_purpose, y = n, fill = y)) +
  # geom_col() draws bar heights straight from n (same as stat = "identity").
  geom_col(position = "dodge") +
  # Swap the axes so the loan purposes run along the vertical axis.
  coord_flip() +
  # Thousands separators on the count axis.
  scale_y_continuous(labels = scales::comma) +
  theme_dark()
Main reasons for taking out a loan (top 10 loan purposes among rows with y = 1):
# Ten most frequent loan purposes among rows where y equals 1,
# rendered with kable().
df %>%
  filter(y == 1) %>%
  count(loan_purpose, sort = TRUE) %>%
  head(10) %>%
  kable()
| loan_purpose | n |
|---|---|
| debt_consolidation | 391875 |
| other | 44888 |
| home_improvements | 27274 |
| business_loan | 10356 |
| medical_bills | 6286 |
| buy_a_car | 5810 |
| buy_house | 3652 |
| take_a_trip | 2870 |
| small_business | 2152 |
| major_purchase | 2120 |
The number of missing values per column (only columns with at least one missing value are shown)
# Count the missing entries in every column of df ...
na_count <- df %>%
  is.na() %>%
  colSums()
# ... and print only the columns that actually contain missing values.
na_count[na_count > 0]
## credit_score yearly_income
## 314333 219439
## bankruptcies years_current_job
## 1805 45949
## months_since_last_delinquent max_open_credit
## 529539 27
Interactive table of loan purpose and bar chart of credit score, both split by y, for further analysis
library(DT)
# Interactive table of row counts for every (y, loan_purpose) combination.
# count() returns an ungrouped tibble, so datatable() receives plain data and
# summarise() does not emit its ".groups" message.
df %>%
  count(y, loan_purpose) %>%
  datatable()
library(plotly)
# Bar chart of row counts for every (y, credit_score) combination; trace names
# are taken from y.
# credit_score is summarised as a character column earlier in this report, so
# each distinct string (and NA) forms its own group here.
# NOTE(review): summarise() drops only the last grouping level, so the data
# handed to plot_ly() is still grouped by y. Confirm how plotly treats grouped
# input before replacing this with an ungrouped count().
df %>%
group_by(y, credit_score) %>%
summarise(n = n()) %>%
plot_ly(x = ~credit_score, y = ~n, name = ~y, type = "bar")